# R script for the Final Exam of 19-704
# Replication and extension of the results from Baltagi et al. (2000)

# written by Alexandra Cosseron

# Set working directory
setwd("C:/Users/Alexandra/Documents/R")

# Install relevant libraries and retrieve data
install.packages("Ecdat", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(Ecdat)
data("Cigar", package = "Ecdat")

# Let's have a first look at the data
cigar <- data.frame(Cigar)
head(cigar)
summary(cigar)

# Creating panel data frame
install.packages("plm", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(plm)
cigar.p <- pdata.frame(cigar, index = c("state", "year"))
head(cigar.p)

# Part 1 - Replication of the results from Baltagi et al. (2000) -----

# Question 1 --------------------------------
# OLS model --------------------
# Creating the variables
library(mgcv)
cigar.p$logsales <- log(cigar.p$sales)
cigar.p$logprice <- log(cigar.p$price)
cigar.p$logndi <- log(cigar.p$ndi)
cigar.p$logpimin <- log(cigar.p$pimin)
head(cigar.p)

# Create lagged regressor
cigar.p$lag1sales <- lag(cigar.p$logsales, k = 1)
# Check that I did the lagging correctly
head(cigar.p[ , c("lag1sales", "logsales")])
# Run OLS regression with lagged regressor
lag1reg <- lm(logsales ~ lag1sales + logprice + 
                logndi + logpimin,
              data = cigar.p)
summary(lag1reg)

# OLS model with time dummies
tdlag1reg <- lm(logsales ~ lag1sales + logprice + 
                 logndi + logpimin + factor(year),
               data = cigar.p)
summary(tdlag1reg)

# Within Transformation
within <- plm(logsales ~ lag1sales + logprice + 
  logndi + logpimin, data = cigar.p, model = "within")
summary(within)

# Within Transformation bis (to be able to draw qqplot later)
# Method that follows Alex Davis' advice
withinreg <- lm(logsales ~ lag1sales + logprice + logndi +  
                  logpimin + factor(state) + factor(year),
                data = cigar.p)
summary(withinreg)

# Part 2 - Extension ------------------------------------------------

# Question 2 ----------------------------------

# Histogram of the dependent variable
png(filename = "Cosseron_FE_saleshists.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1))
hist(cigar.p$sales,
     xlab = "Cigarette sales in packs per capita",
     breaks = "FD",
     main = "")
rug(cigar.p$sales, col = rgb(0, 0, 1, 1))
dev.off()

# Histogram of the log transformed dependent variable
png(filename = "Cosseron_FE_logsaleshists.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1))
hist(cigar.p$logsales,
     xlab = "Log of cigarette sales in packs per capita",
     breaks = "FD",
     main = "")
rug(cigar.p$logsales, col = rgb(0, 0, 1, 1))
dev.off()

# Box-Cox transformation for the dependent variable
library(car)
png(filename = "Cosseron_FE_boxcoxsales.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1))
bc <- boxCox(sales ~ 1, 
       data = cigar.p,
       plotit = TRUE)
dev.off()

# Find the minimum value of lambda
bc$x[bc$y == max(bc$y)]

# Question 3 ---------------------------------

# Histograms of the independent variables
png(filename = "Cosseron_FE_indephists.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1), 
    mfrow = c(3, 1),
    cex.axis = 1.5, 
    cex.lab = 1.5)
hist(cigar.p$price,
     xlab = "Price per pack of cigarttes",
     breaks = "FD",
     main = "")
rug(cigar.p$price, col = rgb(0, 0, 1, 1))
hist(cigar.p$ndi,
     xlab = "Per capita disposable income",
     breaks = "FD",
     main = "")
rug(cigar.p$ndi, col = rgb(0, 0, 1, 1))
hist(cigar.p$pimin,
     xlab = "Minimum price in adjoining states per pack of cigarettes",
     breaks = "FD",
     main = "")
rug(cigar.p$pimin, col = rgb(0, 0, 1, 1))
dev.off()

# Histograms of the log-transformed independent variables
png(filename = "Cosseron_FE_logindephists.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1), 
    mfrow = c(3, 1),
    cex.axis = 1.5, 
    cex.lab = 1.5)
hist(cigar.p$logprice,
     xlab = "Log price per pack of cigarttes",
     breaks = "FD",
     main = "")
rug(cigar.p$logprice, col = rgb(0, 0, 1, 1))
hist(cigar.p$logndi,
     xlab = "Log per capita disposable income",
     breaks = "FD",
     main = "")
rug(cigar.p$logndi, col = rgb(0, 0, 1, 1))
hist(cigar.p$logpimin,
     xlab = "Log minimum price in adjoining states per pack of cigarettes",
     breaks = "FD",
     main = "")
rug(cigar.p$logpimin, col = rgb(0, 0, 1, 1))
dev.off()

# Box-Cox transformation for the independent variable
library(car)
png(filename = "Cosseron_FE_boxcoxindep.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1), 
    mfrow = c(3, 1),
    cex.axis = 1.5, 
    cex.lab = 1.5)
bc1 <- boxCox(price ~ 1, 
             data = cigar.p,
             plotit = TRUE,
             xlab = "Lambda for the Price variable")
bc2 <- boxCox(ndi ~ 1,
              data = cigar.p,
              plotit = TRUE,
              xlab = "Lambda for the NDI variable")
bc3 <- boxCox(pimin ~ 1,
              data = cigar.p,
              plotit = TRUE,
              xlab = "Lambda for the Pimin variable")
dev.off()

# Find the minimum values of lambda
l1 <- bc1$x[bc1$y == max(bc1$y)]
bc2$x[bc2$y == max(bc2$y)]
l3 <- bc3$x[bc3$y == max(bc3$y)]

# Power transformation of the Price and Pimin variable
cigar.p$pprice <- ((cigar.p$price)^l1 - 1)/l1
cigar.p$ppimin <- ((cigar.p$pimin)^l3 - 1)/l3
head(cigar.p)

png(filename = "Cosseron_FE_powerindep.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(cex = 1.3, 
    mar = c(5, 5, 2, 1), 
    mfrow = c(2, 1),
    cex.axis = 1.5, 
    cex.lab = 1.5)
hist(cigar.p$pprice,
     xlab = "Power transformed price per pack of cigarttes",
     breaks = "FD",
     main = "")
rug(cigar.p$pprice, col = rgb(0, 0, 1, 1))
hist(cigar.p$ppimin,
     xlab = "Power transformed minimum price in adjoining states per pack of cigarettes",
     breaks = "FD",
     main = "")
rug(cigar.p$ppimin, col = rgb(0, 0, 1, 1))
dev.off()

# Question 4 --------------------------------

# Q-Q plots for the three models
png(filename = "Cosseron_FE_qqplots.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
par(mfrow = c(1, 3), 
    cex = 1.3, 
    mar = c(5, 4, 4, 1))
# qqplot of the regression residuals from the OLS model
qqPlot(lag1reg,
       main = "OLS model",
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)
# qqplot of the regression residuals from the OLS with time dummies model
qqPlot(tdlag1reg, 
       main = "OLS with time dummies", 
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)
# qqplot of the regression residuals from the within transformation model "bis"
qqPlot(withinreg,
       main = "Within transformation", 
       id.n = 3,
       pch = 19,
       col = rgb(0, 0, 0, .5))
abline(0, 1)
dev.off()

# Influence Index plots for the three models
png(filename = "Cosseron_FE_influenceindex1.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
# Influence index plot and identify the largest three observations
influenceIndexPlot(lag1reg, 
                   id.n = 3)
dev.off()

png(filename = "Cosseron_FE_influenceindex2.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
influenceIndexPlot(tdlag1reg, 
                   id.n = 3)
dev.off()

png(filename = "Cosseron_FE_influenceindex3.png", 
    width = 3000, 
    height = 3000, 
    res = 300)
influenceIndexPlot(withinreg, 
                   id.n = 3)
dev.off()

# Question 5 ---------------------------------------

# Partial pooling version of the Within model
install.packages("arm", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(arm)
# (1|state) and (1|year) indicate that we are estimating partially
# pooled intercepts by state and time
withinreg.pp <- lmer(logsales ~ lag1sales + logprice + logndi +  
                  logpimin + (1|state) + (1|year),
                data = cigar.p)
summary(withinreg.pp)

# Comparison with the no pooling model
withinreg <- lm(logsales ~ lag1sales + logprice + logndi +  
                  logpimin + factor(state) + factor(year),
                data = cigar.p)
summary(withinreg)

# Question 8 --------------------------------
# Constructing 1 year ahead forecasts for the Within model and OLS, 
# omitting the last year in the data (1992). 

# Training data drops the last time period
training <- cigar.p[!cigar.p$year == 92, ]
# Test data includes only the last time period
test <- cigar.p[cigar.p$year == 92, ]

# Within model
withinm <- plm(logsales ~ lag1sales + logprice + 
                logndi + logpimin, 
               data = training, 
               model = "within")

# OLS model
olsm <- lm(logsales ~ lag1sales + logprice + 
              logndi + logpimin,
            data = training)

# Predict test data
withinm.pred <- predict(withinm, newdata = test)
olsm.pred <- predict(olsm, newdata = test)

# Calculate the residuals
withinm.resid <- test$logsales - withinm.pred
olsm.resid <- test$logsales - olsm.pred

# Get the rMSE
rMSE.withinm <- mean(withinm.resid^2)
rMSE.olsm <- mean(olsm.resid^2)

rMSE.withinm
rMSE.olsm

# Question 9 --------------------------------
install.packages("sandwich", repos = "http://lib.stat.cmu.edu/R/CRAN/")
install.packages("lmtest", repos = "http://lib.stat.cmu.edu/R/CRAN/")
library(sandwich)
library(lmtest)

# Computing the heteroskedasticity robust errors
coeftest(within,
         vcov = vcovHC(within,
                       method = "white1",
                       type = "HC0",
                       df = df.residual(within)))

# Complete pooling using plm
within.pool <- plm(logsales ~ lag1sales + logprice + logndi +  
                     logpimin + factor(state) + factor(year),
                   data   = cigar.p,
                   model  = "pooling",
                   effect = "individual")

# Computing serial correlation robust standard errors
coeftest(within.pool,
         vcov = function(x) vcovHC(x,
                                   method = "white1",
                                   type = "HC0",
                                   cluster = "group"))

# The end! Thanks for reading :)